# ------------------------------
# ' recreate "sameness" index at PUMA or county levels
# ' this code can create yearly measures as well as overall measures
# ------------------------------

# custom function to measure SD for weighted sd
weighted.sd <- function(x, w = NULL, na.rm = FALSE) {
	if (na.rm) {
		na <- is.na(x) | is.na(w)
		x <- x[!na]
		w <- w[!na]
	}
	weighted_sd = sqrt(sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1))
	return(weighted_sd)
}

#------------------------------------------------------------------------------
# load processed data : 
rdata = read_fst(file.path(processed_path,paste0('merged_','county','_v1.fst')), as.data.table=TRUE)

#------------------------------------------------------------------------------
# Data Preprocessing
rdata[, Race5 := Race4]
rdata[Hisp == 1, Race5 := 5]
rdata[Race5 == 9, Race5 := NA] 

rdata[, MarStat5 := MarStat6]
rdata[MarStat5 == 6, MarStat5 := 5]

# drop younger than 15
rdata = rdata[AgeGrp4 != 0,]

rdata[, Suic := DSID]

rdata[, St := state]
rdata = rdata[Year >= 2005,]
rdata = rdata[!is.na(St),]

rdata = rdata[,c('county','Year','uid', 'Suic', 'Sex','AgeGrp4','MarStat5','Race5','BornUSA','UnEmpl','PhysProb','county_weight','St')]
data = rdata[!is.na(county) & !is.na(Year),]

data$ObsWgt0 = data[[paste0('county','_weight')]]		

#==============================================================================
list_var = c('Sex','AgeGrp4','Race5','MarStat5','BornUSA', 'UnEmpl', 'PhysProb')

#==============================================================================
# recreate measures for % same category in their community
for (var in list_var){
	prop_data = data[, .(N=sum(ObsWgt0)), by = c('county', var)][, prop := 100 * N / sum(N), by = c('county')]
	prop_data = na.omit(prop_data)

	# add standardized variables by each category
	prop_data[,mean_prop := mean(prop), by=var] # overall mean
	prop_data[,sd_prop := sd(prop), by=var] # overall sd 
	prop_data[,std_prop := (prop - mean_prop) / sd_prop]
	prop_data[, `:=`(N = NULL, mean_prop = NULL, sd_prop = NULL)]

	data = merge(x = data, y = prop_data, by=c('county',var),all.x=TRUE)
		setnames(data, 'prop', paste0('same_prop_',var))
		setnames(data, 'std_prop', paste0('std_same_prop_',var))
}

# generate some proportion variables : gender/age/race again
data[, Female := as.integer(Sex == 2)]
data[, RAT_Female := 100 * weighted.mean(Female, ObsWgt0, na.rm=TRUE), by='county']

var_value = sort(unique(data[["AgeGrp4"]]))
for (i in var_value) {
	mean_data = data[, weighted.mean(AgeGrp4 == i, ObsWgt0, na.rm=TRUE), by='county']
	mean_data[,V1 := 100 * V1]
	setnames(mean_data, 'V1',paste0('RAT_AgeGrp4_',i))
	data = merge(x = data, y = mean_data, by=c('county'), all.x=TRUE)
}

var_value = sort(unique(data[["Race5"]]))
for (i in var_value) {
	mean_data = data[, weighted.mean(Race5 == i, ObsWgt0, na.rm=TRUE), by='county']
	mean_data[,V1 := 100 * V1]
	setnames(mean_data, 'V1',paste0('RAT_Race5_',i))
	data = merge(x = data, y = mean_data, by=c('county'), all.x=TRUE)
}

var_value = sort(unique(data[["MarStat5"]]))
for (i in var_value) {
	mean_data = data[, weighted.mean(MarStat5 == i, ObsWgt0, na.rm=TRUE), by='county']
	mean_data[,V1 := 100 * V1]
	setnames(mean_data, 'V1',paste0('RAT_MarStat5_',i))
	data = merge(x = data, y = mean_data, by=c('county'), all.x=TRUE)
}

data[, RAT_BornUSA := 100 * weighted.mean(BornUSA, ObsWgt0, na.rm=TRUE), by='county']
data[, RAT_UnEmpl := 100 * weighted.mean(UnEmpl, ObsWgt0, na.rm=TRUE), by='county']
data[, RAT_PhysProb := 100 * weighted.mean(PhysProb, ObsWgt0, na.rm=TRUE), by='county']

keepcols = c('uid','ObsWgt0','county','Year','Suic','Female','St',
	list_var,
	grep('same_',names(data),value=TRUE),
	grep('RAT_',names(data),value=TRUE)
	)
keep_data = data[,..keepcols]

outfile = file.path(processed_path, paste0('suicide_reg_','county','_v1.dta'))

export(keep_data, outfile)
